This notebook summarizes the findings from assigning cell type labels
to all ScPCA samples using SCimilarity.
Running this notebook requires the results from the
cell-type-consensus and cell-type-scimilarity
module in OpenScPCA-nf to be saved to
OpenScPCA-analysis/data/current/results.
suppressPackageStartupMessages({
# load required packages
library(ggplot2)
})
# Set default ggplot theme
theme_set(
theme_classic()
)
# Define color ramp for shared use in the heatmaps
heatmap_col_fun <- circlize::colorRamp2(c(0, 1), colors = c("white", "darkslateblue"))
# Set heatmap padding option
ComplexHeatmap::ht_opt(TITLE_PADDING = grid::unit(0.6, "in"))
# settings
options(
dplyr.summarise.inform = FALSE,
readr.show_col_types = FALSE
)
Functions
# function to read in both scimilarity and consensus results and combine into a single data frame
prep_data <- function(library_id, scimilarity_file, consensus_file){
scimilarity_df <- readr::read_tsv(scimilarity_file)
consensus_df <- readr::read_tsv(consensus_file)
annotation_df <- consensus_df |>
dplyr::left_join(scimilarity_df, by = c("barcodes" = "barcode")) |>
dplyr::mutate(
cell_id = glue::glue("{library_id}-{barcodes}"),
total_cells_per_library = dplyr::n()
)
return(annotation_df)
}
# Copied from scpca-nf template report
#' Format DT datatables
#'
#' @param df Data frame to format
#'
#' @return DT datatable
format_datatable <- function(df, caption) {
df |>
DT::datatable(
rownames = FALSE,
extensions = "Scroller",
caption = caption,
options = list(
scroller = TRUE,
deferRender = TRUE,
scrollX = TRUE,
scrollY = 400,
scrollCollapse = TRUE,
language = list(search = "Filter:"),
columnDefs = list(list(
# render missing values as "NA" rather than blanks in the table
targets = "_all",
render = DT::JS(
"function(data, type, row, meta) {",
"return data === null ? 'NA' : data;",
"}"
)
))
)
)
}
Data setup
# The base path for the OpenScPCA repository, found by its (hidden) .git directory
repository_base <- rprojroot::find_root(rprojroot::is_git_root)
# results directory with cell-type-consensus
results_dir <- file.path(repository_base, "data", "current", "results")
consensus_results_dir <- file.path(results_dir, "cell-type-consensus")
scimilarity_results_dir <- file.path(results_dir, "cell-type-scimilarity")
# diagnoses table used for labeling plots from cell-type-consensus module
diagnoses_file <- file.path(repository_base, "analyses", "cell-type-consensus", "sample-info", "project-diagnoses.tsv")
# use the jaccard functions from cell-type-neuroblastoma-04 module
jaccard_functions <- file.path(repository_base, "analyses", "cell-type-neuroblastoma-04", "scripts", "utils", "jaccard-utils.R")
source(jaccard_functions)
# list all results files
consensus_results_files <- list.files(consensus_results_dir, pattern = "_processed_consensus-cell-types\\.tsv.\\gz$", recursive = TRUE, full.names = TRUE)
consensus_files_df <- data.frame(
library_id = stringr::word(basename(consensus_results_files), 1, sep = "_"),
consensus_file = consensus_results_files
)
scimilarity_results_files <- list.files(scimilarity_results_dir, pattern = "_processed_scimilarity-celltype-assignments\\.tsv.\\gz$", recursive = TRUE, full.names = TRUE)
scimilarity_files_df <- data.frame(
library_id = stringr::word(basename(scimilarity_results_files), 1, sep = "_"),
scimilarity_file = scimilarity_results_files
)
all_files_df <- scimilarity_files_df |>
dplyr::left_join(consensus_files_df, by = "library_id")
# define cell line projects to remove
cell_line_projects <- c("SCPCP000020", "SCPCP000024")
# check that everything has a matching consensus file
# we joined the consensus into the scimilarity so only libraries with scimilarity are included
if(sum(is.na(all_files_df$consensus_file)) > 0){
stop("Missing matching consensus cell type file for all libraries")
}
# read in diagnoses
diagnoses_df <- readr::read_tsv(diagnoses_file)
# read in all results as a single dataframe
results_df <- all_files_df |>
purrr::pmap(prep_data) |>
dplyr::bind_rows() |>
# remove cell line projects
dplyr::filter(!project_id %in% cell_line_projects) |>
# add in diagnoses
dplyr::left_join(diagnoses_df, by = "project_id") |>
dplyr::mutate(
# create a label for plotting
project_label = glue::glue("{project_id}:{diagnosis}")
)
SCimilarity results
Below we will summarize the results from SCimilarity on
all ScPCA projects.
Table of all cell types in ScPCA
First, we just look at the cell types identified and see if any of
the cells are not assigned.
sum(is.na(results_df$scimilarity_celltype_annotation))
[1] 0
It looks like every cell gets a cell type, so that’s something to
keep in mind when looking at these results.
Let’s see what cell types are presented in our data?
results_df |>
dplyr::ungroup() |>
dplyr::count(scimilarity_celltype_annotation, name = "total_cells") |>
dplyr::arrange(desc(total_cells)) |>
format_datatable(caption = "All samples")
So it looks like we have a lot of “native cell” labels which really
just means the cell is a cell and didn’t align with anything else in the
reference. This label is probably similar to our “other” label in
CellAssign.
Also, most of our cells are progenitor cells or hematopoietic stem
cells, which either means a lot of the cells are tumor cells and show
stem and progenitor like properties or these are from the blood tumors
which make up a large part of our atlas. Either way, we have cell
types!
Table of cell types per project
This section shows a table of all cell types identified in each
project. Each project will have its own table.
# get all the unique labels
project_labels <- unique(results_df$project_label)
tables <- project_labels |>
purrr::map(\(label) {
results_df |>
dplyr::filter(project_label == label) |>
dplyr::group_by(scimilarity_celltype_annotation) |>
dplyr::summarise(
total_cells = dplyr::n()
) |>
dplyr::arrange(desc(total_cells)) |>
format_datatable(caption = label)
})
htmltools::tagList(tables)
Comparison to existing consensus cell types
In this section, we compare the top 15 SCimilarity cell
type annotations to the consensus cell types determined using only
CellAssign and SingleR annotations for all
libraries in a single project. The rows correspond to consensus cell
types and the columns correspond to SCimilarity
annotations. All other cell types not in the top 10 represented cell
types in a project are grouped into the “All remaining cell types”
category.
results_df <- results_df |>
dplyr::group_by(project_id) |>
dplyr::mutate(
# get most frequently observed cell types across libraries in that project
scimilarity_celltype_lumped = forcats::fct_lump_n(scimilarity_celltype_annotation, 15, other_level = "All remaining cell types", ties.method = "first") |>
# sort by frequency
forcats::fct_infreq() |>
# make sure all remaining and unknown are last, use this to assign colors in specific order
forcats::fct_relevel("All remaining cell types", after = Inf) |>
as.character()
)
project_labels |>
purrr::walk(\(label){
results_df |>
dplyr::filter(project_label == label) |>
make_jaccard_heatmap(
annotation_col1 = "scimilarity_celltype_lumped",
annotation_col2 = "consensus_annotation",
label1 = glue::glue("{label} \nSCimilarity"),
label2 = "Consensus"
)
})





















As expected there is some direct agreement or places where more broad
consensus cell types are split up across more granular cell types in
SCimilarity. A few notable examples:
SCPCP000001: Cells labeled as macrophages in consensus
are split between glial and microglial cells in
SCimilarity. Natural killer cells match up between
consensus and SCimilarity.
SCPCP000006: Endothelial cells match up between
consensus and SCimilarity. The unknown cells in consensus
correspond to mesenchymal stem cells and a variety of kidney cells.
SCPCP000007: Mature T cell in consensus is split
between CD4 and CD8 T cells in SCimilarity.
We also see places where there is not agreement, which is to be
expected.
Conclusions
Of the 203 possible cell types in SCimilarity, 191 of
them are represented in ScPCA samples. There are quite a few scenarios
where consensus cell types “agree” with SCimilarity. It
will be informative to look at the new consensus cell types and how they
change with the addition of the SCimilarity annotations,
but in general the addition of SCimilarity seems quite
useful!
Session info
# record the versions of the packages used in this analysis and other environment information
sessionInfo()
R version 4.4.2 (2024-10-31)
Platform: aarch64-apple-darwin20
Running under: macOS Sequoia 15.6.1
Matrix products: default
BLAS: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.0
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
time zone: America/Chicago
tzcode source: internal
attached base packages:
[1] stats graphics grDevices datasets utils methods base
other attached packages:
[1] ggplot2_3.5.2
loaded via a namespace (and not attached):
[1] tidyr_1.3.1 sass_0.4.10 generics_0.1.4 renv_1.1.4 shape_1.4.6.1 stringi_1.8.7 hms_1.1.3 digest_0.6.37 magrittr_2.0.3 evaluate_1.0.5
[11] grid_4.4.2 RColorBrewer_1.1-3 iterators_1.0.14 circlize_0.4.16 fastmap_1.2.0 rprojroot_2.1.0 foreach_1.5.2 doParallel_1.0.17 jsonlite_2.0.0 GlobalOptions_0.1.2
[21] BiocManager_1.30.26 ComplexHeatmap_2.20.0 purrr_1.1.0 crosstalk_1.2.2 scales_1.4.0 codetools_0.2-20 jquerylib_0.1.4 cli_3.6.5 rlang_1.1.6 crayon_1.5.3
[31] bit64_4.6.0-1 withr_3.0.2 cachem_1.1.0 yaml_2.3.10 tools_4.4.2 parallel_4.4.2 tzdb_0.5.0 dplyr_1.1.4 colorspace_2.1-1 DT_0.34.0
[41] forcats_1.0.0 GetoptLong_1.0.5 BiocGenerics_0.50.0 vctrs_0.6.5 R6_2.6.1 png_0.1-8 matrixStats_1.5.0 stats4_4.4.2 lifecycle_1.0.4 stringr_1.5.1
[51] htmlwidgets_1.6.4 bit_4.6.0 S4Vectors_0.42.1 IRanges_2.38.1 vroom_1.6.5 clue_0.3-66 cluster_2.1.8.1 pkgconfig_2.0.3 pillar_1.11.0 bslib_0.9.0
[61] gtable_0.3.6 glue_1.8.0 xfun_0.53 tibble_3.3.0 tidyselect_1.2.1 knitr_1.50 farver_2.1.2 rjson_0.2.23 htmltools_0.5.8.1 rmarkdown_2.29
[71] readr_2.1.5 compiler_4.4.2
---
title: "Explore SCimilarity cell type annotations"
author: Ally Hawkins
date: "`r Sys.Date()`"
output:
  html_notebook:
    toc: true
    toc_depth: 3
    code_folding: hide
---

This notebook summarizes the findings from assigning cell type labels to all ScPCA samples using `SCimilarity`. 

Running this notebook requires the results from the `cell-type-consensus` and `cell-type-scimilarity` module in `OpenScPCA-nf` to be saved to `OpenScPCA-analysis/data/current/results`. 

```{r packages}
suppressPackageStartupMessages({
  # load required packages
  library(ggplot2)
})

# Set default ggplot theme
theme_set(
  theme_classic()
)

# Define color ramp for shared use in the heatmaps
heatmap_col_fun <- circlize::colorRamp2(c(0, 1), colors = c("white", "darkslateblue"))
# Set heatmap padding option
ComplexHeatmap::ht_opt(TITLE_PADDING = grid::unit(0.6, "in"))

# settings
options(
  dplyr.summarise.inform = FALSE, 
  readr.show_col_types = FALSE
)


```

## Functions


```{r}
# function to read in both scimilarity and consensus results and combine into a single data frame 
prep_data <- function(library_id, scimilarity_file, consensus_file){
  
  scimilarity_df <- readr::read_tsv(scimilarity_file)
  consensus_df <- readr::read_tsv(consensus_file)
  
  annotation_df <- consensus_df |> 
    dplyr::left_join(scimilarity_df, by = c("barcodes" = "barcode")) |> 
    dplyr::mutate(
      cell_id = glue::glue("{library_id}-{barcodes}"), 
      total_cells_per_library = dplyr::n()
    )
  
  return(annotation_df)
  
}
```

```{r}
# Copied from scpca-nf template report 
#' Format DT datatables
#'
#' @param df Data frame to format
#'
#' @return DT datatable
format_datatable <- function(df, caption) {
  df |>
    DT::datatable(
      rownames = FALSE,
      extensions = "Scroller",
      caption = caption,
      options = list(
        scroller = TRUE,
        deferRender = TRUE,
        scrollX = TRUE,
        scrollY = 400,
        scrollCollapse = TRUE,
        language = list(search = "Filter:"),
        columnDefs = list(list(
          # render missing values as "NA" rather than blanks in the table
          targets = "_all",
          render = DT::JS(
            "function(data, type, row, meta) {",
            "return data === null ? 'NA' : data;",
            "}"
          )
        ))
      )
    )
}
```



## Data setup


```{r base paths}
# The base path for the OpenScPCA repository, found by its (hidden) .git directory
repository_base <- rprojroot::find_root(rprojroot::is_git_root)
# results directory with cell-type-consensus 
results_dir <- file.path(repository_base, "data", "current", "results")
consensus_results_dir <- file.path(results_dir, "cell-type-consensus")
scimilarity_results_dir <- file.path(results_dir, "cell-type-scimilarity")

# diagnoses table used for labeling plots from cell-type-consensus module
diagnoses_file <- file.path(repository_base, "analyses", "cell-type-consensus", "sample-info", "project-diagnoses.tsv")
```

```{r}
# use the jaccard functions from cell-type-neuroblastoma-04 module
jaccard_functions <- file.path(repository_base, "analyses", "cell-type-neuroblastoma-04", "scripts", "utils", "jaccard-utils.R")
source(jaccard_functions)
```


```{r}
# list all results files 
consensus_results_files <- list.files(consensus_results_dir, pattern = "_processed_consensus-cell-types\\.tsv.\\gz$", recursive = TRUE, full.names = TRUE)
consensus_files_df <- data.frame(
  library_id = stringr::word(basename(consensus_results_files), 1, sep = "_"),
  consensus_file = consensus_results_files
)

scimilarity_results_files <- list.files(scimilarity_results_dir, pattern = "_processed_scimilarity-celltype-assignments\\.tsv.\\gz$", recursive = TRUE, full.names = TRUE)
scimilarity_files_df <- data.frame(
  library_id = stringr::word(basename(scimilarity_results_files), 1, sep = "_"),
  scimilarity_file = scimilarity_results_files
)

all_files_df <- scimilarity_files_df |> 
  dplyr::left_join(consensus_files_df, by = "library_id")


# define cell line projects to remove
cell_line_projects <- c("SCPCP000020", "SCPCP000024")
```

```{r}
# check that everything has a matching consensus file 
# we joined the consensus into the scimilarity so only libraries with scimilarity are included 
if(sum(is.na(all_files_df$consensus_file)) > 0){
  stop("Missing matching consensus cell type file for all libraries")
}
```


```{r, message=FALSE, warning=FALSE}
# read in diagnoses
diagnoses_df <- readr::read_tsv(diagnoses_file)

# read in all results as a single dataframe
results_df <- all_files_df |> 
  purrr::pmap(prep_data) |> 
  dplyr::bind_rows() |> 
  # remove cell line projects
  dplyr::filter(!project_id %in% cell_line_projects) |> 
  # add in diagnoses 
  dplyr::left_join(diagnoses_df, by = "project_id") |> 
  dplyr::mutate(
    # create a label for plotting
    project_label = glue::glue("{project_id}:{diagnosis}")
  )
```

## `SCimilarity` results 

Below we will summarize the results from `SCimilarity` on all ScPCA projects. 

### Table of all cell types in ScPCA 

First, we just look at the cell types identified and see if any of the cells are not assigned. 

```{r}
sum(is.na(results_df$scimilarity_celltype_annotation))
```

It looks like every cell gets a cell type, so that's something to keep in mind when looking at these results. 

Let's see what cell types are presented in our data? 

```{r}
results_df |> 
  dplyr::ungroup() |> 
  dplyr::count(scimilarity_celltype_annotation, name = "total_cells") |> 
  dplyr::arrange(desc(total_cells)) |> 
      format_datatable(caption = "All samples")
```

So it looks like we have a lot of "native cell" labels which really just means the cell is a cell and didn't align with anything else in the reference. 
This label is probably similar to our "other" label in `CellAssign`. 

Also, most of our cells are progenitor cells or hematopoietic stem cells, which either means a lot of the cells are tumor cells and show stem and progenitor like properties or these are from the blood tumors which make up a large part of our atlas.
Either way, we have cell types! 

### Table of cell types per project 

This section shows a table of all cell types identified in each project. 
Each project will have its own table. 

```{r}
# get all the unique labels
project_labels <- unique(results_df$project_label)

tables <- project_labels |> 
  purrr::map(\(label) {
    
    results_df |> 
      dplyr::filter(project_label == label) |> 
      dplyr::group_by(scimilarity_celltype_annotation) |> 
      dplyr::summarise(
        total_cells = dplyr::n()
      ) |> 
      dplyr::arrange(desc(total_cells)) |> 
      format_datatable(caption = label)
    
  })

htmltools::tagList(tables)
```

### Comparison to existing consensus cell types 

In this section, we compare the top 15 `SCimilarity` cell type annotations to the consensus cell types determined using only `CellAssign` and `SingleR` annotations for all libraries in a single project. 
The rows correspond to consensus cell types and the columns correspond to `SCimilarity` annotations. 
All other cell types not in the top 10 represented cell types in a project are grouped into the "All remaining cell types" category. 

```{r}
results_df <- results_df |> 
    dplyr::group_by(project_id) |> 
    dplyr::mutate(
      # get most frequently observed cell types across libraries in that project 
      scimilarity_celltype_lumped = forcats::fct_lump_n(scimilarity_celltype_annotation, 15, other_level = "All remaining cell types", ties.method = "first") |> 
        # sort by frequency 
        forcats::fct_infreq() |> 
        # make sure all remaining and unknown are last, use this to assign colors in specific order
        forcats::fct_relevel("All remaining cell types", after = Inf) |> 
        as.character()
    )
```


```{r, fig.height = 10, fig.width = 10}
project_labels |> 
  purrr::walk(\(label){
    
    results_df |> 
      dplyr::filter(project_label == label) |> 
      make_jaccard_heatmap(
        annotation_col1 = "scimilarity_celltype_lumped",
        annotation_col2 = "consensus_annotation",
        label1 = glue::glue("{label} \nSCimilarity"),
        label2 = "Consensus"
      )
    
  })
```

As expected there is some direct agreement or places where more broad consensus cell types are split up across more granular cell types in `SCimilarity`. 
A few notable examples: 

- `SCPCP000001`: Cells labeled as macrophages in consensus are split between glial and microglial cells in `SCimilarity`. 
Natural killer cells match up between consensus and `SCimilarity`. 
- `SCPCP000006`: Endothelial cells match up between consensus and `SCimilarity`. 
The unknown cells in consensus correspond to mesenchymal stem cells and a variety of kidney cells. 
- `SCPCP000007`: Mature T cell in consensus is split between CD4 and CD8 T cells in `SCimilarity`. 

We also see places where there is not agreement, which is to be expected.

## Conclusions

Of the 203 possible cell types in `SCimilarity`, 191 of them are represented in ScPCA samples. 
There are quite a few scenarios where consensus cell types "agree" with `SCimilarity`. 
It will be informative to look at the new consensus cell types and how they change with the addition of the `SCimilarity` annotations, but in general the addition of `SCimilarity` seems quite useful!

## Session info 

```{r session info}
# record the versions of the packages used in this analysis and other environment information
sessionInfo()
```


